********************************************************************************
* JOIN LISTINGS WITH NEW BUILDINGS

program define geocode_listings

{

* load zillow data
use unique_listings.dta,clear

* merge in tract characteristics
merge m:1 GEOID using all_tracts_2017, keepus(rent_2br med_hh_inc inc_pc pct_black pct_white pct_college pct_owner pct_vacant pct_poverty pct_200k)
rename (rent_2br med_hh_inc inc_pc pct_black pct_white pct_college pct_owner pct_vacant pct_poverty pct_200k) ///
	(rent_2br_list med_hh_inc_list inc_pc_list pct_black_list pct_white_list pct_college_list pct_owner_list per_vac_list ///
	poverty_rate_list pct_200k_list)
rename _merge tract_char_merge
drop if tract_char_merge==2

replace met="atlanta" if met=="Atlanta, GA"
replace met="austin" if met=="Austin, TX"
replace met="chicago" if strpos(met, "Chicago")>0
replace met="dc" if strpos(met, "Washington")>0
replace met="denver" if strpos(met, "Denver")>0
replace met="la" if strpos(met, "Los Angeles")>0
replace met="brooklyn" if strpos(met, "New York")>0
replace met="philadelphia" if strpos(met, "Philadelphia")>0
replace met="portland" if strpos(met, "Portland")>0
replace met="sf" if strpos(met, "San Fran")>0
replace met="seattle" if strpos(met, "Seattle")>0

save geocoded_listings, replace

* expand data to listing building pairs.  Do this in small chunks
* to avoid stretching RAM capacity

local met_list atlanta austin chicago dc denver la brooklyn philadelphia portland sf seattle 
foreach city in `met_list'{

	use geocoded_listings, clear
	keep if met=="`city'"
	gen bcutter=runiform()
	save `city'_cutter, replace

	forvalues i = 1/10{

		local j=`i'-1
		
		use `city'_cutter, clear
		gen ll=`j'*.1
		gen ul=`i'*.1
		keep if inrange(bcutter,ll,ul)==1
		
		joinby met using short_apartments
		geodist listing_lat listing_lon latitude longitude, gen(alt_dist)
		keep if alt_dist<1
		save `city'_listing_tracts_`i'.dta , replace	

	}

	use `city'_listing_tracts_1,clear

	forvalues i = 2/10{

		append using `city'_listing_tracts_`i'

	}
	save `city'_listings_tracts, replace

}

* append it back together into one data set
use atlanta_listings_tracts, clear

local met_list austin chicago dc denver la brooklyn philadelphia portland sf seattle
foreach city in `met_list'{

	append using `city'_listings_tracts

}
 
save "listing_tracts.dta" , replace

}

end

********************************************************************************
* ADD INFORMATION ON THE PAIRED BUILDINGS AND ORGANIZE

program define add_building_info

{

use listing_tracts,clear

* merge in building info
merge m:1 id using apartment_distances
drop if _merge==2
drop _merge

rename GEOID listing_tract
merge m:1 id using rca_characteristics
drop if _merge==2
drop _merge

merge m:1 id using rca_cleaned
drop if _merge==2
drop _merge

* identify high student buildings
rename building_tract GEOID
merge m:1 GEOID using all_tracts_2017,keepus(pct_students)
keep if _merge==3
drop _merge
rename pct_students building_pct_students
rename GEOID building_tract

* make time variables
gen listing_date=date(listingdate,"YMD")
drop listingdate
drop met
drop bcutter ll ul

gen listing_month=month(listing_date)
gen listing_year=year(listing_date)
gen listing_quarter=floor((listing_month-1)/3)+1
gen listing_half=floor((listing_month-1)/6)+1

* clean and rename listing variables
rename alt_dist dist
rename hpid listing_id
rename bedroomcnt bedrooms
rename bathroomcnt bathrooms
rename leasepricedollarcnt rent
rename *_list listing_*
rename tract_char_merge listing_tract_merge
rename (bedrooms bathrooms) (listing_bedrooms listing_bathrooms)
rename finished listing_sqft

order listing* 
order msa
order rent dist listing_bedrooms listing_sqft listing_bathrooms , after(listing_id)
order building_built building_lat building_lon, after(building_id)
order listing_id msa listing_year 

replace dist=dist*1000
save zillow_analysis_file_long, replace


}

end


********************************************************************************
* CREATE NEAR-FAR FILES

program define near_far_file

{

use zillow_analysis_file_long,clear
gen lrent=log(rent)

* keep only the observations close to a 2015/2016 building, and only the closest
* building if there are multiple
keep if building_built==2015 | building_built==2016 
bys listing_id listing_date: egen double min_dist=min(dist)
keep if min_dist==dist

* winsorize rents and create bedroom/bathroom bins
replace listing_bedrooms=3 if listing_bedrooms>3
replace listing_bathrooms=3 if listing_bathrooms>3
replace listing_bathrooms=ceil(listing_bathrooms)
winsor2 rent lrent, cuts(1 99) replace

* get rid of observations outside of max distance
drop if dist>$max_dist

* get rid of 2012 listings, which are too sparse to use with year FE
drop if listing_year==2012

* make years-to-treatment dummies for both 250m and 400m treatment radius
gen l250=(dist<250)
gen l400=(dist<400)

gen gap=listing_year-building_built

forvalues i=1(1)7{

	gen tx_`i'=0
	gen tx_n`i'=0
	
	gen tx4_`i'=0
	gen tx4_n`i'=0
	
	replace tx_`i'=1 if gap==`i' & l250==1 
	replace tx_n`i'=1 if gap==-`i' & l250==1 

	replace tx4_`i'=1 if gap==`i' & l400==1 
	replace tx4_n`i'=1 if gap==-`i' & l400==1 

}

gen tx_0=0
replace tx_0=1 if gap==0 & l250==1 

gen tx4_0=0
replace tx4_0=1 if gap==0 & l400==1

* bin dummy end points
replace tx_n3=1 if gap<-3 & l250==1
replace tx_3=1 if gap>3 & l250==1
replace tx4_n3=1 if gap<-3 & l400==1
replace tx4_3=1 if gap>3 & l400==1

* create building id time trend
egen building_id_year=group(building_id listing_year)

* use msa median income to create gentrification definition
gen building_income_g=0
replace building_income_g = 1 if msa=="atlanta" & building_med_hh_inc<66000
replace building_income_g = 1 if msa=="austin" & building_med_hh_inc<74000
replace building_income_g = 1 if msa=="chicago" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="denver" & building_med_hh_inc<77000
replace building_income_g = 1 if msa=="la" & building_med_hh_inc<70000
replace building_income_g = 1 if msa=="brooklyn" & building_med_hh_inc<76000
replace building_income_g = 1 if msa=="philadelphia" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="portland" & building_med_hh_inc<72000
replace building_income_g = 1 if msa=="sf" & building_med_hh_inc<96000
replace building_income_g = 1 if msa=="seattle" & building_med_hh_inc<83000
replace building_income_g = 1 if msa=="dc" & building_med_hh_inc<83000

* construct sample indicator, weights, and indicators for whether buildings 
* have any nearby listings

if ("$g"=="g"){

	gen building_sample=(inlist(building_built, 2015, 2016)>0 ///
		& building_units>=50 & dist>20 & building_income_g==1 ///
		& listing_year>2012 & b_nearby_${iso_dist}_built==0 ///
		& building_pct_student<.25)
		
}

if ("$g"=="ng"){

	gen building_sample=(inlist(building_built, 2015, 2016)>0 ///
		& building_units>=50 & dist>20 ///
		& listing_year>2012 & b_nearby_${iso_dist}_built==0 ///
		& building_pct_student<.25)
		
}

bys building_id listing_year: egen building_sample_weight_d=sum(building_sample)
gen building_sample_weight=1/building_sample_weight_d	

forvalues i=2013/2018{

	bys building_id: egen number_in_`i'=sum(listing_year==`i' & building_sample==1)
	
}

egen min_number=rowmin(number_in_*)

	
forvalues i=2013/2018{

	bys building_id: egen tx250_number_in_`i'=sum(listing_year==`i' & building_sample==1 & l250==1)
	
}

egen tx250_min_number=rowmin(tx250_number_in_*)

forvalues i=2013/2018{

	bys building_id: egen tx256_number_in_`i'=sum(listing_year==`i' & building_sample==1  & inrange(dist,250,$max_dist)>0)
	
}

egen tx256_min_number=rowmin(tx256_number_in_*)

forvalues i=2013/2018{

	bys building_id: egen tx400_number_in_`i'=sum(listing_year==`i' & building_sample==1 & l400==1)
	
}

egen tx400_min_number=rowmin(tx400_number_in_*)

forvalues i=2013/2018{

	bys building_id: egen tx46_number_in_`i'=sum(listing_year==`i' & building_sample==1 & inrange(dist,400,$max_dist)>0)
	
}

egen tx46_min_number=rowmin(tx46_number_in_*)

save near_far_m${max_dist}_i${iso_dist}_${g}, replace
 
 }
 
end


********************************************************************************
* CREATE NEAR-NEAR FILES

program define near_near_file

{

use zillow_analysis_file_long,clear
gen lrent=log(rent)

* keep only the observations close to a 2015/2016/2019 building, and only the closest
* building if there are multiple
keep if inlist(building_built,2015,2016,2019)>0
bys listing_id listing_date: egen double min_dist=min(dist)
keep if min_dist==dist

* get rid of observations outside of max distance
keep if dist<$max_dist

* get rid of 2012 listings, which are too sparse to use with year FE
drop if listing_year==2012

* winsorize rents and create bedroom/bathroom bins
replace listing_bedrooms=3 if listing_bedrooms>3
replace listing_bathrooms=3 if listing_bathrooms>3
replace listing_bathrooms=ceil(listing_bathrooms)
winsor2 rent lrent, cuts(1 99) replace

* make years-to-treatment dummies
gen sample=(inrange(building_built,2015,2016)>0)
gen gap=listing_year-building_built

forvalues i=1(1)3{

	gen tx_`i'=0
	gen tx_n`i'=0
	replace tx_`i'=1 if gap==`i' & sample==1 
	replace tx_n`i'=1 if gap==-`i' & sample==1 
	
}

gen tx_0=0
replace tx_0=1 if gap==0 & sample==1

* bin end points and make other useful variables
replace tx_n3 = 1 if gap<-3 & sample==1
replace tx_3 = 1 if gap>3 & sample==1
gen zero=0
gen treated=sample
gen treated_after=treated*(listing_year>=building_built)

* generate time trend variable
egen msa_year=group(msa listing_year)

* gentrification indicator
gen building_income_g=0
replace building_income_g = 1 if msa=="atlanta" & building_med_hh_inc<66000
replace building_income_g = 1 if msa=="austin" & building_med_hh_inc<74000
replace building_income_g = 1 if msa=="chicago" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="denver" & building_med_hh_inc<77000
replace building_income_g = 1 if msa=="la" & building_med_hh_inc<70000
replace building_income_g = 1 if msa=="brooklyn" & building_med_hh_inc<76000
replace building_income_g = 1 if msa=="philadelphia" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="portland" & building_med_hh_inc<72000
replace building_income_g = 1 if msa=="sf" & building_med_hh_inc<96000
replace building_income_g = 1 if msa=="seattle" & building_med_hh_inc<83000
replace building_income_g = 1 if msa=="dc" & building_med_hh_inc<83000

* construct sample indicator, weights, and indicators for whether buildings 
* have any nearby listings

if ("$g"=="g"){

	gen building_sample=(inlist(building_built, 2015, 2016,2019)>0 ///
		& building_units>=50 & dist>20 & building_income_g==1 ///
		& listing_year>2012 & b_nearby_${iso_dist}_built==0 ///
		& building_pct_student<.25)
		
}

if ("$g"=="ng"){

	gen building_sample=(inlist(building_built, 2015, 2016,2019)>0 ///
		& building_units>=50 & dist>20 ///
		& listing_year>2012 & b_nearby_${iso_dist}_built==0 ///
		& building_pct_student<.25)
		
}

bys building_id listing_year: egen building_sample_weight_d=sum(building_sample)
gen building_sample_weight=1/building_sample_weight_d	

forvalues i=2013/2018{

	bys building_id: egen number_in_`i'=sum(listing_year==`i' & building_sample==1)
	
}

egen min_number=rowmin(number_in_*)

* drop msas where we don't have placebo buildings
drop if inlist(msa, "dc", "sf")>0

save near_near_m${max_dist}_i${iso_dist}_${g}, replace

}

end


*******************************************************************************
* CREATE DDD FILES

program define triple_diff_file

{

use zillow_analysis_file_long,clear
gen lrent=log(rent)

* keep only the observations close to a 2015/2016/2019 building, and only the closest
* building if there are multiple
keep if inlist(building_built,2015,2016,2019)>0
bys listing_id listing_date: egen double min_dist=min(dist)
keep if min_dist==dist

* get rid of observations outside of max distance
keep if dist<$max_dist

* get rid of 2012 listings, which are too sparse to use with year FE
drop if listing_year==2012

* winsorize rents and create bedroom/bathroom bins
replace listing_bedrooms=3 if listing_bedrooms>3
replace listing_bathrooms=3 if listing_bathrooms>3
replace listing_bathrooms=ceil(listing_bathrooms)
winsor2 rent lrent, cuts(1 99) replace

* ddd variables
gen treated=(inrange(building_built,2015,2016)>0)

gen l250=(dist<250)
gen l400=(dist<400)

gen after=(listing_year>=building_built)
replace after = 1 if listing_year>2014 & treated==0
gen after_250_treated=(treated==1 & l250==1 & listing_year>=building_built)

* make time trend variable
egen building_id_year=group(building_id listing_year)


* gentrification defs
gen building_income_g=0
replace building_income_g = 1 if msa=="atlanta" & building_med_hh_inc<66000
replace building_income_g = 1 if msa=="austin" & building_med_hh_inc<74000
replace building_income_g = 1 if msa=="chicago" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="denver" & building_med_hh_inc<77000
replace building_income_g = 1 if msa=="la" & building_med_hh_inc<70000
replace building_income_g = 1 if msa=="brooklyn" & building_med_hh_inc<76000
replace building_income_g = 1 if msa=="philadelphia" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="portland" & building_med_hh_inc<72000
replace building_income_g = 1 if msa=="sf" & building_med_hh_inc<96000
replace building_income_g = 1 if msa=="seattle" & building_med_hh_inc<83000
replace building_income_g = 1 if msa=="dc" & building_med_hh_inc<83000

* construct sample indicator, weights, and indicators for whether buildings 
* have any nearby listings
if ("$g"=="g"){

	gen building_sample=(inlist(building_built, 2015, 2016,2019)>0 ///
		& building_units>=50 & dist>20 & building_income_g==1 ///
		& listing_year>2012 & b_nearby_${iso_dist}_built==0 ///
		& building_pct_student<.25)
		
}

if ("$g"=="ng"){

	gen building_sample=(inlist(building_built, 2015, 2016,2019)>0 ///
		& building_units>=50 & dist>20 ///
		& listing_year>2012 & b_nearby_${iso_dist}_built==0 ///
		& building_pct_student<.25)
		
}
	

bys building_id listing_year: egen building_sample_weight_d=sum(building_sample)
gen building_sample_weight=1/building_sample_weight_d	

* sample inclusion

forvalues i=2013/2018{

	bys building_id: egen number_in_`i'=sum(listing_year==`i' & building_sample==1)
	
}

egen min_number=rowmin(number_in_*)

	
forvalues i=2013/2018{

	bys building_id: egen tx250_number_in_`i'=sum(listing_year==`i' & building_sample==1 & l250==1)
	
}

egen tx250_min_number=rowmin(tx250_number_in_*)

forvalues i=2013/2018{

	bys building_id: egen tx256_number_in_`i'=sum(listing_year==`i' & building_sample==1  & inrange(dist,250,$max_dist)>0)
	
}

egen tx256_min_number=rowmin(tx256_number_in_*)

forvalues i=2013/2018{

	bys building_id: egen tx400_number_in_`i'=sum(listing_year==`i' & building_sample==1 & l400==1)
	
}

egen tx400_min_number=rowmin(tx400_number_in_*)

forvalues i=2013/2018{

	bys building_id: egen tx46_number_in_`i'=sum(listing_year==`i' & building_sample==1 & inrange(dist,400,$max_dist)>0)
	
}

egen tx46_min_number=rowmin(tx46_number_in_*)

* drop msas where we don't have placebo buildings
drop if inlist(msa, "dc", "sf")>0

save triple_diff_m${max_dist}_i${iso_dist}_${g}, replace

}

end


*******************************************************************************
* CREATE FAR-FAR FILE

program define far_far_file

{

use zillow_analysis_file_long,clear
gen lrent=log(rent)

* keep only the observations close to a 2015/2016/2019 building, and only the closest
* building if there are multiple
keep if inlist(building_built,2015,2016,2019)>0
bys listing_id listing_date: egen double min_dist=min(dist)
keep if min_dist==dist

* get rid of observations outside of max distance or inside treatment area
keep if dist>250
keep if dist<$max_dist

* get rid of 2012 listings, which are too sparse to use with year FE
drop if listing_year==2012

* winsorize rents and create bedroom/bathroom bins
replace listing_bedrooms=3 if listing_bedrooms>3
replace listing_bathrooms=3 if listing_bathrooms>3
replace listing_bathrooms=ceil(listing_bathrooms)
winsor2 rent lrent, cuts(1 99) replace

* dd variables
gen treated=(inrange(building_built,2015,2016)>0)
gen treated_after=treated*(listing_year>=building_built)

* time trend variable
egen msa_year=group(msa listing_year)

* gentrification definitions
gen building_income_g=0
replace building_income_g = 1 if msa=="atlanta" & building_med_hh_inc<66000
replace building_income_g = 1 if msa=="austin" & building_med_hh_inc<74000
replace building_income_g = 1 if msa=="chicago" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="denver" & building_med_hh_inc<77000
replace building_income_g = 1 if msa=="la" & building_med_hh_inc<70000
replace building_income_g = 1 if msa=="brooklyn" & building_med_hh_inc<76000
replace building_income_g = 1 if msa=="philadelphia" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="portland" & building_med_hh_inc<72000
replace building_income_g = 1 if msa=="sf" & building_med_hh_inc<96000
replace building_income_g = 1 if msa=="seattle" & building_med_hh_inc<83000
replace building_income_g = 1 if msa=="dc" & building_med_hh_inc<83000

* construct sample indicator, weights, and indicators for whether buildings 
* have any nearby listings
if ("$g"=="g"){

	gen building_sample=(inlist(building_built, 2015, 2016,2019)>0 ///
		& building_units>=50 & dist>20 & building_income_g==1 ///
		& listing_year>2012 & b_nearby_${iso_dist}_built==0 ///
		& building_pct_student<.25)
		
}

if ("$g"=="ng"){

	gen building_sample=(inlist(building_built, 2015, 2016,2019)>0 ///
		& building_units>=50 & dist>20 ///
		& listing_year>2012 & b_nearby_${iso_dist}_built==0 ///
		& building_pct_student<.25)
		
}
	
bys building_id listing_year: egen building_sample_weight_d=sum(building_sample)
gen building_sample_weight=1/building_sample_weight_d	


forvalues i=2013/2018{

	bys building_id: egen number_in_`i'=sum(listing_year==`i' & building_sample==1)
	
}

egen min_number=rowmin(number_in_*)

* drop msas where we don't have placebo buildings
drop if inlist(msa, "dc", "sf")>0

save far_far_m${max_dist}_i${iso_dist}_${g}, replace

}

end


********************************************************************************
* CREATE NEAR-FAR FILE FOR LONG PRETREND ROBUSTNESS CHECK

program define long_pretrend_file

{

use zillow_analysis_file_long,clear
gen lrent=log(rent)

* keep only the observations close to a 2019 building, and only the closest
* building if there are multiple
keep if building_built==2019
bys listing_id listing_date: egen double min_dist=min(dist)
keep if min_dist==dist

* winsorize rents and create bedroom/bathroom bins
replace listing_bedrooms=3 if listing_bedrooms>3
replace listing_bathrooms=3 if listing_bathrooms>3
replace listing_bathrooms=ceil(listing_bathrooms)
winsor2 rent lrent, cuts(1 99) replace

* get rid of observations outside of max distance
drop if dist>$max_dist

* get rid of 2012 listings, which are too sparse to use with year FE
drop if listing_year==2012

* make years-to-treatment dummies for both 250m and 400m treatment radius
gen l250=(dist<250)
gen l400=(dist<400)

gen gap=listing_year-building_built

forvalues i = 2013(1)2018{

	gen txy_`i'=l250*(listing_year==`i')

}

* make  time trend variable
egen building_id_year=group(building_id listing_year)

* gentrification definition
gen building_income_g=0
replace building_income_g = 1 if msa=="atlanta" & building_med_hh_inc<66000
replace building_income_g = 1 if msa=="austin" & building_med_hh_inc<74000
replace building_income_g = 1 if msa=="chicago" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="denver" & building_med_hh_inc<77000
replace building_income_g = 1 if msa=="la" & building_med_hh_inc<70000
replace building_income_g = 1 if msa=="brooklyn" & building_med_hh_inc<76000
replace building_income_g = 1 if msa=="philadelphia" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="portland" & building_med_hh_inc<72000
replace building_income_g = 1 if msa=="sf" & building_med_hh_inc<96000
replace building_income_g = 1 if msa=="seattle" & building_med_hh_inc<83000
replace building_income_g = 1 if msa=="dc" & building_med_hh_inc<83000

* construct sample indicator, weights, and indicators for whether buildings 
* have any nearby listings
gen building_sample=(inlist(building_built, 2019)>0 ///
	& building_units>=50 & dist>20 & building_income_g==1 ///
	& listing_year>2012 & b_nearby_${iso_dist}_built==0 ///
	& building_pct_student<.25)
		
bys building_id listing_year: egen building_sample_weight_d=sum(building_sample)
gen building_sample_weight=1/building_sample_weight_d	

forvalues i=2013/2018{

	bys building_id: egen number_in_`i'=sum(listing_year==`i' & building_sample==1)
	
}

egen min_number=rowmin(number_in_*)

	
forvalues i=2013/2018{

	bys building_id: egen tx250_number_in_`i'=sum(listing_year==`i' & building_sample==1 & l250==1)
	
}

egen tx250_min_number=rowmin(tx250_number_in_*)

forvalues i=2013/2018{

	bys building_id: egen tx256_number_in_`i'=sum(listing_year==`i' & building_sample==1  & inrange(dist,250,$max_dist)>0)
	
}

egen tx256_min_number=rowmin(tx256_number_in_*)

save lpt_m${max_dist}_i${iso_dist}_${g}, replace
 
 }
 
end

********************************************************************************
* CREATE FILE FOR LONG RUN NEAR FAR EFFECT

program define long_run_effect_file

{

use zillow_analysis_file_long,clear

* keep only the observations with a <2014 building, and only the closest
* building if there are multiple
keep if building_built<2014
bys listing_id listing_date: egen double min_dist=min(dist)
keep if min_dist==dist

* winsorize rents and create bedroom/bathroom bins
replace listing_bedrooms=3 if listing_bedrooms>3
replace listing_bathrooms=3 if listing_bathrooms>3
replace listing_bathrooms=ceil(listing_bathrooms)
winsor2 rent lrent, cuts(1 99) replace

* get rid of observations outside of max distance
drop if dist>$max_dist

* get rid of 2012 listings, which are too sparse to use with year FE
drop if listing_year==2012

* make treatment dummies
gen l250=(dist<250)
gen l400=(dist<400)

gen gap=listing_year-building_built

forvalues i = 2013(1)2018{

	gen txy_`i'=l250*(listing_year==`i')

}

* make time trend variable
egen building_id_year=group(building_id listing_year)

* gentrification definitions

gen building_income_g=0
replace building_income_g = 1 if msa=="atlanta" & building_med_hh_inc<66000
replace building_income_g = 1 if msa=="austin" & building_med_hh_inc<74000
replace building_income_g = 1 if msa=="chicago" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="denver" & building_med_hh_inc<77000
replace building_income_g = 1 if msa=="la" & building_med_hh_inc<70000
replace building_income_g = 1 if msa=="brooklyn" & building_med_hh_inc<76000
replace building_income_g = 1 if msa=="philadelphia" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="portland" & building_med_hh_inc<72000
replace building_income_g = 1 if msa=="sf" & building_med_hh_inc<96000
replace building_income_g = 1 if msa=="seattle" & building_med_hh_inc<83000
replace building_income_g = 1 if msa=="dc" & building_med_hh_inc<83000

* construct sample indicator, weights, and indicators for whether buildings 
* have any nearby listings
gen building_sample=(inlist(building_built, 2011,2012,2013)>0 ///
	& building_units>=50 & dist>20 & building_income_g==1 ///
	& listing_year>2012 & b_nearby_${iso_dist}_built==0 ///
	& building_pct_student<.25)

bys building_id listing_year: egen building_sample_weight_d=sum(building_sample)
gen building_sample_weight=1/building_sample_weight_d	

forvalues i=2013/2018{

	bys building_id: egen number_in_`i'=sum(listing_year==`i' & building_sample==1)
	
}

egen min_number=rowmin(number_in_*)

forvalues i=2013/2018{

	bys building_id: egen tx250_number_in_`i'=sum(listing_year==`i' & building_sample==1 & l250==1)
	
}

egen tx250_min_number=rowmin(tx250_number_in_*)

forvalues i=2013/2018{

	bys building_id: egen tx256_number_in_`i'=sum(listing_year==`i' & building_sample==1  & inrange(dist,250,$max_dist)>0)
	
}

egen tx256_min_number=rowmin(tx256_number_in_*)

save lre_m${max_dist}_i${iso_dist}_${g}, replace
 
 }
 
end


********************************************************************************
* CREATE DATA SET FOR EMPIRICAL DERIVATIVE FILE

program define emp_derivative 

{

* code uses a wide file as the basis.  First step is to reshape
use zillow_analysis_file_long, clear
drop if dist>800

keep msa listing* rent dist building_id building_built building_units building_tract

sort listing_id listing_date dist
bys listing_id listing_date: egen sub_id = seq()
drop if sub_id>40

rename building_id base_id
reshape wide dist building_* base_id, i(listing_id listing_date msa) j(sub_id) 

rename base_id* building_id*

order msa listing*
order rent listing_bedrooms listing_sqft listing_bathrooms , after(listing_id)
order listing_id

save zillow_analysis_file_wide, replace

* load wide file
use zillow_analysis_file_wide, clear

sort listing_id listing_year listing_month

**** This analysis uses a stricter inclusion criterion than the main analysis. 
* For each new property, we want the ones we deem completely isolated from any new 
* construction out to 600m ***

*** Assert that the first new building in the suffix index is 
* always the closest one to the listing ***
forvalues x=2/38 {
	gen dist_assert1_`x'=(dist1<dist`x')
	tab dist_assert1_`x', m
}
drop if dist_assert1_2==0


*** Now, drop those listings where distance1>600, b/c that means that the closest the 
* listing is to a building is >600m ***
drop if dist1>600

*** Now assert those cases where all other buildings are 600+ meters away ***
forvalues x=2/38 {
	gen dist_assert`x'_GT600m=(dist`x'>600)
}

** Rowtotal these flags to figure out which listings have all other buildings always greater than 600m **
egen count_oth_bldngs_GT600m=rowtotal(dist_assert*_GT600m)

*** Now keep only those buildings (building_id1) where all in-sample listings are <600m from building1 but >600m from any other new building ****
** 37 is maximum number of other nearby new buildings (besides the closest one) a listing has been exposed to in our dataset **
gen dist_assertGT600m_37=(count_oth_bldngs_GT600m==37)
bysort building_id1 (listing_id listing_year listing_month): egen min_dist_assertGT600m_flag=min(dist_assertGT600m_37)
keep if min_dist_assertGT600m_flag==1

drop dist_assert*_GT600m dist_assert1_* 

*** Merge on caracteristics file ***
rename building_id1 	id
merge m:1 id using rca_characteristics.dta, keepusing(msa building_lat building_lon building_med_hh_inc building_pct_college building_poverty_rate building_pct_200k building_pct_white building_pct_black )
drop if _merge==2
drop _merge	
rename id 		building_id1
rename building_lat 	building_lat1
rename building_lon 	building_lon1

* iso and g observations
gen income_g=0
replace income_g = 1 if msa=="atlanta" & building_med_hh_inc<66000
replace income_g = 1 if msa=="austin" & building_med_hh_inc<74000
replace income_g = 1 if msa=="chicago" & building_med_hh_inc<69000
replace income_g = 1 if msa=="denver" & building_med_hh_inc<77000
replace income_g = 1 if msa=="la" & building_med_hh_inc<70000
replace income_g = 1 if msa=="brooklyn" & building_med_hh_inc<76000
replace income_g = 1 if msa=="philadelphia" & building_med_hh_inc<69000
replace income_g = 1 if msa=="portland" & building_med_hh_inc<72000
replace income_g = 1 if msa=="sf" & building_med_hh_inc<96000
replace income_g = 1 if msa=="seattle" & building_med_hh_inc<83000
replace income_g = 1 if msa=="dc" & building_med_hh_inc<83000

gen dense_city=inlist(msa,"chicago","brooklyn")==1
gen rent_control=inlist(msa,"brooklyn","sf","la","dc")==1

keep msa listing* rent listing_bedrooms listing_sqft listing_bathrooms dist1 building_id1 building_built1 building_lat1 building_lon1 building_units1 building_tract1 income_g dense_city rent_control

*save buildings_GT600m_of_all_other_buildings_v2, replace

*use buildings_GT600m_of_all_other_buildings_v2, clear

*** Rename variable ***
unab listing_vars: listing*
local listing_vars="`listing_vars' msa rent listing_bedrooms listing_sqft listing_bathrooms"
local building_vars="dist1 building_id1 building_built1 building_lat1 building_lon1 building_units1 building_tract1"

* rename building vars *
foreach x in `building_vars' {

	if regexm("`x'", "(.*)1$")==1 {
		local ren_var=regexs(1)
	}
	rename `x' `ren_var'
}

order msa building_id building_built building_lat building_lon building_units building_tract dist income_g dense_city rent_control `listing_vars'

outsheet using listing_data_w600m.csv, comma replace 
save listing_data_w600m.dta, replace

	
}

end


*******************************************************************************
* CREATE ALTERNATIVE DATA SAMPLES FOR APPENDIX ROBUSTNESS CHECKS

program define alternative_data_sets

{
	
* near far with no pioneer
global max_dist 600
global iso_dist 250
global g g

{

use zillow_analysis_file_long,clear
gen lrent=log(rent)

* keep only the observations close to a 2015/2016 building, and only the closest
* building if there are multiple
keep if building_built==2015 | building_built==2016 
bys listing_id listing_date: egen double min_dist=min(dist)
keep if min_dist==dist

* winsorize rents and create bedroom/bathroom bins
replace listing_bedrooms=3 if listing_bedrooms>3
replace listing_bathrooms=3 if listing_bathrooms>3
replace listing_bathrooms=ceil(listing_bathrooms)
winsor2 rent lrent, cuts(1 99) replace

* get rid of observations outside of max distance
drop if dist>$max_dist

* get rid of 2012 listings, which are too sparse to use with year FE
drop if listing_year==2012

* make years-to-treatment dummies for both 250m and 400m treatment radius
gen l250=(dist<250)
gen l400=(dist<400)

gen gap=listing_year-building_built

forvalues i=1(1)7{

	gen tx_`i'=0
	gen tx_n`i'=0
	
	gen tx4_`i'=0
	gen tx4_n`i'=0
	
	replace tx_`i'=1 if gap==`i' & l250==1 
	replace tx_n`i'=1 if gap==-`i' & l250==1 

	replace tx4_`i'=1 if gap==`i' & l400==1 
	replace tx4_n`i'=1 if gap==-`i' & l400==1 

}

gen tx_0=0
replace tx_0=1 if gap==0 & l250==1 

gen tx4_0=0
replace tx4_0=1 if gap==0 & l400==1

* bin dummy end points
replace tx_n3=1 if gap<-3 & l250==1
replace tx_3=1 if gap>3 & l250==1
replace tx4_n3=1 if gap<-3 & l400==1
replace tx4_3=1 if gap>3 & l400==1

* create building id time trend
egen building_id_year=group(building_id listing_year)

* use msa median income to create gentrification definition
gen building_income_g=0
replace building_income_g = 1 if msa=="atlanta" & building_med_hh_inc<66000
replace building_income_g = 1 if msa=="austin" & building_med_hh_inc<74000
replace building_income_g = 1 if msa=="chicago" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="denver" & building_med_hh_inc<77000
replace building_income_g = 1 if msa=="la" & building_med_hh_inc<70000
replace building_income_g = 1 if msa=="brooklyn" & building_med_hh_inc<76000
replace building_income_g = 1 if msa=="philadelphia" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="portland" & building_med_hh_inc<72000
replace building_income_g = 1 if msa=="sf" & building_med_hh_inc<96000
replace building_income_g = 1 if msa=="seattle" & building_med_hh_inc<83000
replace building_income_g = 1 if msa=="dc" & building_med_hh_inc<83000

* construct sample indicator, weights, and indicators for whether buildings 
* have any nearby listings

if ("$g"=="g"){

	gen building_sample=(inlist(building_built, 2015, 2016)>0 ///
		& building_units>=50 & dist>20 & building_income_g==1 ///
		& listing_year>2012  ///
		& building_pct_student<.25)
		
}

if ("$g"=="ng"){

	gen building_sample=(inlist(building_built, 2015, 2016)>0 ///
		& building_units>=50 & dist>20 ///
		& listing_year>2012  ///
		& building_pct_student<.25)
		
}

bys building_id listing_year: egen building_sample_weight_d=sum(building_sample)
gen building_sample_weight=1/building_sample_weight_d	

forvalues i=2013/2018{

	bys building_id: egen number_in_`i'=sum(listing_year==`i' & building_sample==1)
	
}

egen min_number=rowmin(number_in_*)

	
forvalues i=2013/2018{

	bys building_id: egen tx250_number_in_`i'=sum(listing_year==`i' & building_sample==1 & l250==1)
	
}

egen tx250_min_number=rowmin(tx250_number_in_*)

forvalues i=2013/2018{

	bys building_id: egen tx256_number_in_`i'=sum(listing_year==`i' & building_sample==1  & inrange(dist,250,$max_dist)>0)
	
}

egen tx256_min_number=rowmin(tx256_number_in_*)

forvalues i=2013/2018{

	bys building_id: egen tx400_number_in_`i'=sum(listing_year==`i' & building_sample==1 & l400==1)
	
}

egen tx400_min_number=rowmin(tx400_number_in_*)

forvalues i=2013/2018{

	bys building_id: egen tx46_number_in_`i'=sum(listing_year==`i' & building_sample==1 & inrange(dist,400,$max_dist)>0)
	
}

egen tx46_min_number=rowmin(tx46_number_in_*)

save near_far_np, replace
 
 }
 

* near near with no pioneer
global max_dist 250
global iso_dist 250
global g g

{

use zillow_analysis_file_long,clear
gen lrent=log(rent)

* keep only the observations close to a 2015/2016/2019 building, and only the closest
* building if there are multiple
keep if inlist(building_built,2015,2016,2019)>0
bys listing_id listing_date: egen double min_dist=min(dist)
keep if min_dist==dist

* get rid of observations outside of max distance
keep if dist<$max_dist

* get rid of 2012 listings, which are too sparse to use with year FE
drop if listing_year==2012

* winsorize rents and create bedroom/bathroom bins
replace listing_bedrooms=3 if listing_bedrooms>3
replace listing_bathrooms=3 if listing_bathrooms>3
replace listing_bathrooms=ceil(listing_bathrooms)
winsor2 rent lrent, cuts(1 99) replace

* make years-to-treatment dummies
gen sample=(inrange(building_built,2015,2016)>0)
gen gap=listing_year-building_built

forvalues i=1(1)3{

	gen tx_`i'=0
	gen tx_n`i'=0
	replace tx_`i'=1 if gap==`i' & sample==1 
	replace tx_n`i'=1 if gap==-`i' & sample==1 
	
}

gen tx_0=0
replace tx_0=1 if gap==0 & sample==1

* bin end points and make other useful variables
replace tx_n3 = 1 if gap<-3 & sample==1
replace tx_3 = 1 if gap>3 & sample==1
gen zero=0
gen treated=sample
gen treated_after=treated*(listing_year>=building_built)

* generate time trend variable
egen msa_year=group(msa listing_year)

* gentrification indicator
gen building_income_g=0
replace building_income_g = 1 if msa=="atlanta" & building_med_hh_inc<66000
replace building_income_g = 1 if msa=="austin" & building_med_hh_inc<74000
replace building_income_g = 1 if msa=="chicago" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="denver" & building_med_hh_inc<77000
replace building_income_g = 1 if msa=="la" & building_med_hh_inc<70000
replace building_income_g = 1 if msa=="brooklyn" & building_med_hh_inc<76000
replace building_income_g = 1 if msa=="philadelphia" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="portland" & building_med_hh_inc<72000
replace building_income_g = 1 if msa=="sf" & building_med_hh_inc<96000
replace building_income_g = 1 if msa=="seattle" & building_med_hh_inc<83000
replace building_income_g = 1 if msa=="dc" & building_med_hh_inc<83000

* construct sample indicator, weights, and indicators for whether buildings 
* have any nearby listings

if ("$g"=="g"){

	gen building_sample=(inlist(building_built, 2015, 2016,2019)>0 ///
		& building_units>=50 & dist>20 & building_income_g==1 ///
		& listing_year>2012 ///
		& building_pct_student<.25)
		
}

if ("$g"=="ng"){

	gen building_sample=(inlist(building_built, 2015, 2016,2019)>0 ///
		& building_units>=50 & dist>20 ///
		& listing_year>2012  ///
		& building_pct_student<.25)
		
}

bys building_id listing_year: egen building_sample_weight_d=sum(building_sample)
gen building_sample_weight=1/building_sample_weight_d	

forvalues i=2013/2018{

	bys building_id: egen number_in_`i'=sum(listing_year==`i' & building_sample==1)
	
}

egen min_number=rowmin(number_in_*)

save near_near_np, replace

}


* near near with only high income
global max_dist 250
global iso_dist 250

{
	
use zillow_analysis_file_long,clear
gen lrent=log(rent)

* keep only the observations close to a 2015/2016/2019 building, and only the closest
* building if there are multiple
keep if inlist(building_built,2015,2016,2019)>0
bys listing_id listing_date: egen double min_dist=min(dist)
keep if min_dist==dist

* get rid of observations outside of max distance
keep if dist<$max_dist

* get rid of 2012 listings, which are too sparse to use with year FE
drop if listing_year==2012

* winsorize rents and create bedroom/bathroom bins
replace listing_bedrooms=3 if listing_bedrooms>3
replace listing_bathrooms=3 if listing_bathrooms>3
replace listing_bathrooms=ceil(listing_bathrooms)
winsor2 rent lrent, cuts(1 99) replace

* make years-to-treatment dummies
gen sample=(inrange(building_built,2015,2016)>0)
gen gap=listing_year-building_built

forvalues i=1(1)3{

	gen tx_`i'=0
	gen tx_n`i'=0
	replace tx_`i'=1 if gap==`i' & sample==1 
	replace tx_n`i'=1 if gap==-`i' & sample==1 
	
}

gen tx_0=0
replace tx_0=1 if gap==0 & sample==1

* bin end points and make other useful variables
replace tx_n3 = 1 if gap<-3 & sample==1
replace tx_3 = 1 if gap>3 & sample==1
gen zero=0
gen treated=sample
gen treated_after=treated*(listing_year>=building_built)

* generate time trend variable
egen msa_year=group(msa listing_year)

* gentrification indicator
gen building_income_g=0
replace building_income_g = 1 if msa=="atlanta" & building_med_hh_inc<66000
replace building_income_g = 1 if msa=="austin" & building_med_hh_inc<74000
replace building_income_g = 1 if msa=="chicago" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="denver" & building_med_hh_inc<77000
replace building_income_g = 1 if msa=="la" & building_med_hh_inc<70000
replace building_income_g = 1 if msa=="brooklyn" & building_med_hh_inc<76000
replace building_income_g = 1 if msa=="philadelphia" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="portland" & building_med_hh_inc<72000
replace building_income_g = 1 if msa=="sf" & building_med_hh_inc<96000
replace building_income_g = 1 if msa=="seattle" & building_med_hh_inc<83000
replace building_income_g = 1 if msa=="dc" & building_med_hh_inc<83000

* construct sample indicator, weights, and indicators for whether buildings 
* have any nearby listings


gen building_sample=(inlist(building_built, 2015, 2016,2019)>0 ///
	& building_units>=50 & dist>20 ///
	& listing_year>2012 & b_nearby_${iso_dist}_built==0 ///
	& building_pct_student<.25 & building_income_g==0)
		


bys building_id listing_year: egen building_sample_weight_d=sum(building_sample)
gen building_sample_weight=1/building_sample_weight_d	

forvalues i=2013/2018{

	bys building_id: egen number_in_`i'=sum(listing_year==`i' & building_sample==1)
	
}

egen min_number=rowmin(number_in_*)


save near_near_hi, replace

}


* near far with only high income
global max_dist 600
global iso_dist 250

{

use zillow_analysis_file_long,clear
gen lrent=log(rent)

* keep only the observations close to a 2015/2016 building, and only the closest
* building if there are multiple
keep if building_built==2015 | building_built==2016 
bys listing_id listing_date: egen double min_dist=min(dist)
keep if min_dist==dist

* winsorize rents and create bedroom/bathroom bins
replace listing_bedrooms=3 if listing_bedrooms>3
replace listing_bathrooms=3 if listing_bathrooms>3
replace listing_bathrooms=ceil(listing_bathrooms)
winsor2 rent lrent, cuts(1 99) replace

* get rid of observations outside of max distance
drop if dist>$max_dist

* get rid of 2012 listings, which are too sparse to use with year FE
drop if listing_year==2012

* make years-to-treatment dummies for both 250m and 400m treatment radius
gen l250=(dist<250)
gen l400=(dist<400)

gen gap=listing_year-building_built

forvalues i=1(1)7{

	gen tx_`i'=0
	gen tx_n`i'=0
	
	gen tx4_`i'=0
	gen tx4_n`i'=0
	
	replace tx_`i'=1 if gap==`i' & l250==1 
	replace tx_n`i'=1 if gap==-`i' & l250==1 

	replace tx4_`i'=1 if gap==`i' & l400==1 
	replace tx4_n`i'=1 if gap==-`i' & l400==1 

}

gen tx_0=0
replace tx_0=1 if gap==0 & l250==1 

gen tx4_0=0
replace tx4_0=1 if gap==0 & l400==1

* bin dummy end points
replace tx_n3=1 if gap<-3 & l250==1
replace tx_3=1 if gap>3 & l250==1
replace tx4_n3=1 if gap<-3 & l400==1
replace tx4_3=1 if gap>3 & l400==1

* create building id time trend
egen building_id_year=group(building_id listing_year)

* use msa median income to create gentrification definition
gen building_income_g=0
replace building_income_g = 1 if msa=="atlanta" & building_med_hh_inc<66000
replace building_income_g = 1 if msa=="austin" & building_med_hh_inc<74000
replace building_income_g = 1 if msa=="chicago" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="denver" & building_med_hh_inc<77000
replace building_income_g = 1 if msa=="la" & building_med_hh_inc<70000
replace building_income_g = 1 if msa=="brooklyn" & building_med_hh_inc<76000
replace building_income_g = 1 if msa=="philadelphia" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="portland" & building_med_hh_inc<72000
replace building_income_g = 1 if msa=="sf" & building_med_hh_inc<96000
replace building_income_g = 1 if msa=="seattle" & building_med_hh_inc<83000
replace building_income_g = 1 if msa=="dc" & building_med_hh_inc<83000

* construct sample indicator, weights, and indicators for whether buildings 
* have any nearby listings

gen building_sample=(inlist(building_built, 2015, 2016)>0 ///
	& building_units>=50 & dist>20 & building_income_g==1 ///
	& listing_year>2012 & b_nearby_${iso_dist}_built==1 ///
	& building_pct_student<.25)

bys building_id listing_year: egen building_sample_weight_d=sum(building_sample)
gen building_sample_weight=1/building_sample_weight_d	

forvalues i=2013/2018{

	bys building_id: egen number_in_`i'=sum(listing_year==`i' & building_sample==1)
	
}

egen min_number=rowmin(number_in_*)

	
forvalues i=2013/2018{

	bys building_id: egen tx250_number_in_`i'=sum(listing_year==`i' & building_sample==1 & l250==1)
	
}

egen tx250_min_number=rowmin(tx250_number_in_*)

forvalues i=2013/2018{

	bys building_id: egen tx256_number_in_`i'=sum(listing_year==`i' & building_sample==1  & inrange(dist,250,$max_dist)>0)
	
}

egen tx256_min_number=rowmin(tx256_number_in_*)

forvalues i=2013/2018{

	bys building_id: egen tx400_number_in_`i'=sum(listing_year==`i' & building_sample==1 & l400==1)
	
}

egen tx400_min_number=rowmin(tx400_number_in_*)

forvalues i=2013/2018{

	bys building_id: egen tx46_number_in_`i'=sum(listing_year==`i' & building_sample==1 & inrange(dist,400,$max_dist)>0)
	
}

egen tx46_min_number=rowmin(tx46_number_in_*)

save near_far_hi, replace
 
 }
 
}

end

